Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq
suppressPackageStartupMessages({
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("limma")
library("kableExtra")
library("dplyr")
library("tidyr")
library("ComplexHeatmap")
library("RColorBrewer")
library("circlize")
library("pathview")
library("stringr")
library("grid")
library("VennDiagram")
library("enrichplot")
library("ggplot2")
library("viridis")
library("readxl")
library("cowplot")
library("grid")
})
knitr::opts_chunk$set(dev = 'svg')
The goal of our study is to compare the expression of genes related to metabolism of DFT1 cell lines to DFT2 cell lines. For this, we have generated transcriptomes for 3 DFT1 and 3 DFT2 cell lines, each in triplicate.
ss <- read.table("../ss.tsv",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)
ss$clone <- sapply(strsplit(ss$ClientID,"_"),"[[",1)
ss %>%
kbl(caption="Sample sheet for all samples") %>%
kable_paper("hover", full_width = F)
| C | ClientID | DFT | Replicate | Index1sequence | Index2sequence | X.ReadsIdentified.PF. | TotalReads | TotalBases.Gbases. | clone |
|---|---|---|---|---|---|---|---|---|---|
| 6180766 | 4906_1-1 | DFT1 | 1 | TTGTTGCA | GACGTCGT | 2.4465 | 103481165 | 15.5 | 4906 |
| 6180767 | C5065_1-1 | DFT1 | 1 | CCACACTT | CGTCATAC | 2.8116 | 118924031 | 17.8 | C5065 |
| 6180768 | 1426_1-1 | DFT1 | 1 | CCCGTTTG | TTGCTGTA | 2.3264 | 98401219 | 14.8 | 1426 |
| 6180769 | RV_1-1 | DFT2 | 1 | ATGCTCCC | CCCTGCTG | 2.2811 | 96485136 | 14.5 | RV |
| 6180770 | SN_1-1 | DFT2 | 1 | GCTCAATA | CATTTATC | 2.3947 | 101290147 | 15.2 | SN |
| 6180771 | TD549_1-1 | DFT2 | 1 | GTAGTTCG | CTTGATGC | 2.1732 | 91921221 | 13.8 | TD549 |
| 6180772 | 4906_2-1 | DFT1 | 2 | CGAGAACC | ATGTATCG | 2.2322 | 94416781 | 14.2 | 4906 |
| 6180773 | C5065_2-1 | DFT1 | 2 | GCCATGTA | ATAGGGTT | 2.4505 | 103650355 | 15.5 | C5065 |
| 6180774 | 1426_2-1 | DFT1 | 2 | TTTCTCTA | CTCGACGT | 2.4778 | 104805081 | 15.7 | 1426 |
| 6180775 | RV_2-1 | DFT2 | 2 | CCAGCGAT | TCTAGTCA | 2.9882 | 126393794 | 19.0 | RV |
| 6180776 | SN_2-1 | DFT2 | 2 | TGGGAGTG | CCCGTCTA | 2.4459 | 103455786 | 15.5 | SN |
| 6180777 | TD549_2-1 | DFT2 | 2 | CCCTCGTA | TAGAAGAG | 2.5437 | 107592495 | 16.1 | TD549 |
| 6180778 | 4906_3-1 | DFT1 | 3 | CGATATGG | GGGACGTA | 2.2711 | 96062159 | 14.4 | 4906 |
| 6180779 | C5065_3-1 | DFT1 | 3 | TTGTGCCC | TTTCCATC | 2.2763 | 96282107 | 14.4 | C5065 |
| 6180780 | 1426_3-1 | DFT1 | 3 | TGTCCTCT | CGACGAAC | 2.4952 | 105541059 | 15.8 | 1426 |
| 6180781 | RV_3-1 | DFT2 | 3 | GTATAGTC | TTGGTCTC | 2.3325 | 98659234 | 14.8 | RV |
| 6180782 | SN_3-1 | DFT2 | 3 | TTTGGGAT | GTTCACGT | 2.6750 | 113146174 | 17.0 | SN |
| 6180783 | TD549_3-1 | DFT2 | 3 | CACCAAGC | AAAGGGAA | 2.1610 | 91405190 | 13.7 | TD549 |
| DEA5_4NEG | DEA4_6NEG | NA | CGGAGAGG | CGAGCGTC | 0.0002 | 8460 | 0.0 | DEA4 |
We are interested in comparing all DFT1 samples against all DFT2 samples. We will use mSarHar 1.11 from Ensembl v109 for the reference transcriptome.
# volcanoplots
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="#D5D7E2",
main=name, xlab="log2 FC", ylab="-log10 pval")
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="#5297A7")
}
# heatmaps
custom_heatmap <- function(zscore, fc, aveexpr, title) {
h1 <- Heatmap(zscore, cluster_rows = F,
column_labels = colnames(zscore), name="Z-score",
cluster_columns = T,
column_names_gp = gpar(fontsize = 7, fontface="bold"),
col=colorRamp2(c(-1.5,0,1.5), hcl_palette = "Blue-Red 2"),
heatmap_legend_param = list(title = "Z-score", at = seq(-1.5, 1.5,0.5)))
h2 <- Heatmap(fc, row_labels = rownames(fc), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", col = col_logFC,column_names_gp = gpar(fontsize =7,fontface="bold"),
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(fc[i, j],2),2), x, y,gp=gpar(fontsize=6))
}, column_title = title,
heatmap_legend_param = list(title = "logFC", at = seq(-20, 20, 10)))
h <- h1 + h2
h
}
Here we load the data in from the aligner.
tmp <- read.table("../fastq/3col.tsv.gz")
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))
dim(x)
## [1] 43512 19
Load gene names.
gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)
gn <- gn[order(gn$V1),]
dim(gn)
## [1] 43512 3
Load homology map.
hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)
Now need to collapse transcript data to genes.
x$gene <- paste(gn$V2,gn$V3)
y <- aggregate(. ~ gene,x,sum)
rownames(y) <- y$gene
y$gene = NULL
dim(y)
## [1] 23829 19
Samples with <1M reads should be omitted. Will also round values to integers.
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)
y <- round(y)
This will help us to visualise the sources of variability in the overall dataset. Fix sample names. Plot MDS of all samples then plot MDS by DFT.
y <- y[,colnames(y) != "DEA5-4NEG"]
ss <- ss[ss$ClientID != "DEA4_6NEG",]
colnames(y) <- sapply(strsplit(ss$ClientID,"-"),"[[",1)
saveRDS(y, file = "y_clines.rds")
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1, xlab="reads")
cols <- ss$DFT
cols <- gsub("DFT1","#B3B9DF",cols)
cols <- gsub("DFT2","#DDAFB7",cols)
mymds <- plotMDS(y,plot=FALSE)
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
par(mar = c(5.1, 4.1, 4.1, 2.1) )
plotMDS(mymds,pch=19,cex=3,col=cols,main="Cell lines",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("blue=DFT1, pink=DFT2")
y_DFT1 <- y[,colnames(y) %in% c("4906_1","4906_2","4906_3",
"1426_1","1426_2","1426_3",
"C5065_1","C5065_2","C5065_3")]
y_DFT2 <- y[,colnames(y) %in% c("RV_1","RV_2","RV_3",
"SN_1","SN_2","SN_3",
"TD549_1","TD549_2","TD549_3")]
par(mar = c(5.1, 4.1, 4.1, 2.1) )
mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=19,cex=3,col="#B3B9DF",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")
par(mar = c(5.1, 4.1, 4.1, 2.1) )
mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=19,cex=3,col="#DDAFB7",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")
Sum replicates. Plot top 40 DEGs.
x4906 <- rowSums(y[,ss$clone=="4906"])
xC5065 <- rowSums(y[,ss$clone=="C5065"])
x1426 <- rowSums(y[,ss$clone=="1426"])
xRV <- rowSums(y[,ss$clone=="RV"])
xSN <- rowSums(y[,ss$clone=="SN"])
xTD549 <- rowSums(y[,ss$clone=="TD549"])
y2 <- data.frame(x4906,xC5065,x1426,xRV,xSN,xTD549)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "clone"
ss2$DFT <- factor(c("DFT1","DFT1","DFT1","DFT2","DFT2","DFT2"))
y2 <- y2[which(rowMeans(y2)>10),]
dim(y2)
## [1] 17468 6
dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )
## converting counts to integer mode
dds2 <- dds
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z<- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2 <- sig
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)
## [1] 4200
length(sig2_dn)
## [1] 4412
# top 20 up & top 20 down
sig <- sig[order(sig$log2FoldChange, decreasing=TRUE),]
sig_noNA <- sig
sig_noNA$genes <- rownames(sig_noNA)
sig_noNA <- filter(sig_noNA, !grepl(' NA', genes))
top <- rbind(head(sig_noNA,20),tail(sig_noNA,20))
mx <- top[,7:ncol(top)] # get normalised counts
mx.scaled <- t(apply(mx[,1:6], 1, scale)) # center and scale each column (Z-score)
colnames(mx.scaled) <- colnames(mx[,1:6])
colnames(mx.scaled) <- c("4906", "C5065", "1426", "RV", "SN", "TD549")
log2FC <- as.matrix(top$log2FoldChange)
rownames(log2FC) <- rownames(top)
colnames(log2FC) <- "logFC"
mean <- as.matrix(top$baseMean)
rownames(mean) <- rownames(top)
colnames(mean) <- "AveExpr"
col_logFC <- colorRamp2(c(-20,0,20), hcl_palette = "Blue-Red 2")
ha <- HeatmapAnnotation(summary=anno_summary(gp=gpar(fill="#D5D7E2"),height=unit(2, "cm")))
rownames(mx.scaled) <- gsub("^.{0,21}", "", rownames(mx.scaled))
rownames(log2FC) <- gsub("^.{0,21}", "", rownames(log2FC))
top40 <- custom_heatmap(mx.scaled, log2FC, mean, title = "Cell lines")
top40
saveRDS(top40, "top40_clines.RDS")
make_volcano(dge2,"Cell lines")